Vast Mini Challenge 2 - Answer

Jovinka Hartanto
7/13/2021

Question 1

By observing loyalty and credit card data, we can observe that the top three most visited places during the 2 weeks data are:

  1. Katerina’s Cafe Employee usually go to Katerina’s Cafe on between

  2. Hippokampos Employee usually go to Hippokampos on between

  3. Guy’s Gyros Employee usually go to Guy’s Gyros Cafe on between

##Anomalies

  1. Mismatch between credit card and loyalty card transactions

As we can see in the figure below, the number of daily frequency from credit card data and loyalty data are different. Based on credit card data the Katerina’s Cafe is most visited in 2014-06-01 with total of … transactions. However, if we observed loyalty data Katerina’s Cafe is most visited in … with total of … transactions. This problem also happened to other shops or location that we are observing.

There are total of … un-matched records. These un-matched records might lead to some new clues.

  1. Wrong time in credit card transaction data Bean There Done That Brewed Awakenings Jack’s Magical Beans

To correct this anomaly, we can add vehicle data to our analysis, we might be able to determine some of the transactions based on the employee location.

cc_freq <- group_by(cc_data, location) %>% summarise(freq = n())%>% arrange(desc(freq))
cc_freq
# A tibble: 34 x 2
   location              freq
   <chr>                <int>
 1 Katerina's Cafe        212
 2 Hippokampos            171
 3 Guy's Gyros            158
 4 Brew've Been Served    156
 5 Hallowed Grounds        92
 6 Ouzeri Elian            87
 7 Abila Zacharo           72
 8 Gelatogalore            64
 9 Kalami Kafenion         64
10 Bean There Done That    47
# ... with 24 more rows
loyalty_freq <- group_by(loyalty_data, location) %>% summarise(freq = n())%>% arrange(desc(freq))
loyalty_freq
# A tibble: 33 x 2
   location              freq
   <chr>                <int>
 1 Katerina's Cafe        195
 2 Hippokampos            155
 3 Guy's Gyros            146
 4 Brew've Been Served    140
 5 Ouzeri Elian            84
 6 Hallowed Grounds        80
 7 Abila Zacharo           71
 8 Kalami Kafenion         66
 9 Gelatogalore            60
10 Bean There Done That    45
# ... with 23 more rows
cc_freq <- group_by(cc_data, location, day, hours) %>% summarise(freq = n())%>% arrange(desc(freq))
cc_freq
# A tibble: 377 x 4
# Groups:   location, day [162]
   location            day       hours  freq
   <chr>               <chr>     <int> <int>
 1 Brew've Been Served Tuesday       7    22
 2 Brew've Been Served Wednesday     7    20
 3 Hippokampos         Thursday     13    20
 4 Brew've Been Served Monday        7    18
 5 Brew've Been Served Thursday      7    18
 6 Katerina's Cafe     Tuesday      20    18
 7 Katerina's Cafe     Saturday     19    17
 8 Brew've Been Served Friday        8    16
 9 Hallowed Grounds    Wednesday     7    16
10 Brew've Been Served Friday        7    15
# ... with 367 more rows
loyalty_freq <- group_by(loyalty_data, location) %>% summarise(freq = n())%>% arrange(desc(freq))
loyalty_freq
# A tibble: 33 x 2
   location              freq
   <chr>                <int>
 1 Katerina's Cafe        195
 2 Hippokampos            155
 3 Guy's Gyros            146
 4 Brew've Been Served    140
 5 Ouzeri Elian            84
 6 Hallowed Grounds        80
 7 Abila Zacharo           71
 8 Kalami Kafenion         66
 9 Gelatogalore            60
10 Bean There Done That    45
# ... with 23 more rows
cc_freq_daily <- group_by(cc_data, timestamp_date, location) %>% summarise(freq = n())%>% arrange(desc(freq))
top3_place = c("Katerina's Cafe", "Hippokampos","Guy's Gyros")

cc_freq_daily_selected <- cc_freq_daily %>% filter(location %in% top3_place)

top3_vis_cc <- ggplot(cc_freq_daily_selected, aes(fill=as.character(timestamp_date), y=freq, x=location)) + 
    geom_bar(position="stack", stat="identity") +
    geom_text(label = cc_freq_daily_selected$freq, position=position_stack(vjust=0.5))+
    ggtitle("CC data top 3 location")

ggplotly(top3_vis_cc)
loyalty_freq_daily <- group_by(loyalty_data, timestamp, location) %>% summarise(freq = n())%>% arrange(desc(freq))
top3_place = c("Katerina's Cafe", "Hippokampos","Guy's Gyros")

loyalty_freq_daily_selected <- loyalty_freq_daily %>% filter(location %in% top3_place)

top3_vis_L <- ggplot(loyalty_freq_daily_selected, aes(fill=as.character(timestamp), y=freq, x=location)) + 
    geom_bar(position="stack", stat="identity") +
    geom_text(label = loyalty_freq_daily_selected$freq, position=position_stack(vjust=0.5))+
    ggtitle("Loyalty data top 3 location")

ggplotly(top3_vis_L)

##Question 2

gps_data <- read_csv("MC2/gps.csv")

gps_data$Timestamp_date <- date_time_parse(gps_data$Timestamp,
                                zone = "",
                                format = "%m/%d/%Y")

gps_data$Timestamp <- date_time_parse(gps_data$Timestamp,
                                zone = "",
                                format = "%m/%d/%Y %H:%M")
gps_data$Time <- format(gps_data$Timestamp, format = "%H:%M")
gps_data$Hour <- get_hour(gps_data$Timestamp)


gps_sort <- gps_data[order(gps_data$Timestamp, gps_data$id),]

stop_fin_2 = data.frame()

for (id_1 in unique(gps_sort$id))
{
  make_stop = data.frame()
  gps_id <- gps_sort %>%
  filter(id==id_1)
  
  for(i in 1:nrow(gps_id))
  {
    if(i < nrow(gps_id))
    {
      start <- gps_id$Timestamp[i]
      end <- gps_id$Timestamp[i+1]
    } else
    {
      start <- gps_id$Timestamp[i]
      end <- gps_id$Timestamp[i]
      print(start)
    }
    elapsed.time <- start%--%end
    time_duration <- as.duration(elapsed.time)
    
    if(time_duration > 300)
    {
      make_stop = rbind(make_stop,cbind(gps_id[i,],time_duration,i),cbind(gps_id[i+1,],time_duration,i))
    }
    
  }
  
  stop_fin_2 = rbind(stop_fin_2, make_stop)
}
[1] "2014-01-19 20:07:00 +08"
[1] "2014-01-19 16:06:00 +08"
[1] "2014-01-19 19:48:00 +08"
[1] "2014-01-19 19:47:00 +08"
[1] "2014-01-19 20:11:00 +08"
[1] "2014-01-19 19:47:00 +08"
[1] "2014-01-19 20:24:00 +08"
[1] "2014-01-19 15:57:00 +08"
[1] "2014-01-19 19:53:00 +08"
[1] "2014-01-19 20:21:00 +08"
[1] "2014-01-19 20:38:00 +08"
[1] "2014-01-19 19:58:00 +08"
[1] "2014-01-19 20:02:00 +08"
[1] "2014-01-18 15:52:00 +08"
[1] "2014-01-19 19:49:00 +08"
[1] "2014-01-19 14:11:00 +08"
[1] "2014-01-19 19:56:00 +08"
[1] "2014-01-19 18:25:00 +08"
[1] "2014-01-19 19:49:00 +08"
[1] "2014-01-19 20:09:00 +08"
[1] "2014-01-19 20:20:00 +08"
[1] "2014-01-18 20:21:00 +08"
[1] "2014-01-19 20:27:00 +08"
[1] "2014-01-19 14:32:00 +08"
[1] "2014-01-19 20:03:00 +08"
[1] "2014-01-16 17:50:00 +08"
[1] "2014-01-19 20:38:00 +08"
[1] "2014-01-19 19:46:00 +08"
[1] "2014-01-19 20:04:00 +08"
[1] "2014-01-19 20:56:00 +08"
[1] "2014-01-19 20:36:00 +08"
[1] "2014-01-19 20:11:00 +08"
[1] "2014-01-19 19:54:00 +08"
[1] "2014-01-19 19:46:00 +08"
[1] "2014-01-19 20:25:00 +08"
[1] "2014-01-17 17:21:00 +08"
[1] "2014-01-16 21:06:00 +08"
[1] "2014-01-16 19:07:00 +08"
[1] "2014-01-16 20:00:00 +08"
[1] "2014-01-19 19:50:00 +08"
elapsed.time
[1] 2014-01-19 19:50:00 +08--2014-01-19 19:50:00 +08
location_point = read_csv("MC2/stop_fin.csv") %>% select(lat,long,Possible_Location)%>% unique()
stop_fin_2$lat = round(stop_fin_2$lat,5)
stop_fin_2$long = round(stop_fin_2$long,5)

stop_final = merge(stop_fin_2,location_point, by.x = c("lat","long"), by.y = c("lat","long"), all.x = TRUE)

stop_fin <- stop_final


stop_fin_location <- stop_fin[!(stop_fin$Possible_Location %like% 'mice'),]

gps_sf_stopfin <- st_as_sf(stop_fin_location,
                   coords = c("long","lat"),
                   crs=4326)
gps_sf_stopfin_unique1 <- gps_sf_stopfin  %>% select(Possible_Location,geometry)%>%unique()%>%arrange(Possible_Location)

gps_sf_stopfin_unique <- gps_sf_stopfin_unique1[-c(221,476,644,1100,1103),]

gps_path_stopfin <- gps_sf_stopfin_unique %>%
  group_by(Possible_Location) %>%
  summarize(m_Timestamp = mean(geometry),
            do_union = FALSE ) %>%
  st_cast("LINESTRING")


Abila_st <- st_read(dsn = "MC2/Geospatial",
                    layer = "Abila")
Reading layer `Abila' from data source 
  `C:\jovinkahartanto\assignment_distill\MC2\Geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 3290 features and 9 fields
Geometry type: LINESTRING
Dimension:     XY
Bounding box:  xmin: 24.82401 ymin: 36.04502 xmax: 24.90997 ymax: 36.09492
Geodetic CRS:  WGS 84
P <- npts(Abila_st, by_feature = TRUE)
abila_st_2 <- cbind(Abila_st,P) %>% filter(P > 1)
bgmap <- raster("MC2/Geospatial/MC2-tourist.TIF")

tmap_mode("view")
tm_shape(bgmap) +
  tm_rgb(bgmap, r = 1, g = 2, b = 3,
         alpha = NA,
         saturation = 1,
         interpolate = TRUE,
         max.value = 255) +
  tm_shape(abila_st_2) +
  tm_lines(col = "red", scale = 1)+
  tm_shape(gps_path_stopfin[9,]) +
  tm_lines(col = "blue", scale =4, interactive = TRUE) +
  tm_text("Possible_Location", size = 3, remove.overlap = TRUE, overwrite.lines = TRUE, just = "top")
gps_data <- read_csv("MC2/gps.csv")

gps_data$Timestamp_date <- date_time_parse(gps_data$Timestamp,
                                zone = "",
                                format = "%m/%d/%Y")

gps_data$Timestamp <- date_time_parse(gps_data$Timestamp,
                                zone = "",
                                format = "%m/%d/%Y %H:%M")
gps_data$Time <- format(gps_data$Timestamp, format = "%H:%M")
gps_data$Hour <- get_hour(gps_data$Timestamp)


gps_sf <- st_as_sf(gps_data,
                   coords = c("long","lat"),
                   crs=4326)

gps_path <- gps_sf %>%
  group_by(id, Timestamp_date,Hour) %>%
  summarize(m_Timestamp = mean(Timestamp_date),
            do_union = FALSE ) %>%
  st_cast("LINESTRING")


Abila_st <- st_read(dsn = "MC2/Geospatial",
                    layer = "Abila")
Reading layer `Abila' from data source 
  `C:\jovinkahartanto\assignment_distill\MC2\Geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 3290 features and 9 fields
Geometry type: LINESTRING
Dimension:     XY
Bounding box:  xmin: 24.82401 ymin: 36.04502 xmax: 24.90997 ymax: 36.09492
Geodetic CRS:  WGS 84
P <- npts(Abila_st, by_feature = TRUE)
abila_st_2 <- cbind(Abila_st,P) %>% filter(P > 1)
gps_path_selected <- gps_path %>%
  filter(id==28)


bgmap <- raster("MC2/Geospatial/MC2-tourist.TIF")

tmap_mode("view")
tm_shape(bgmap) +
  tm_rgb(bgmap, r = 1, g = 2, b = 3,
         alpha = NA,
         saturation = 1,
         interpolate = TRUE,
         max.value = 255) +
  tm_shape(abila_st_2) +
  tm_lines(col = "red", scale = 1)+
  tm_shape(gps_path_selected[1:4,]) +
  tm_lines(col = "black", scale =4, interactive = TRUE) +
  tm_text("Hour", size = 3, remove.overlap = TRUE, overwrite.lines = TRUE, just = "top")